home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Disc to the Future 2
/
Disc to the Future Part II Programmer's Reference (Wayzata Technology)(6013)(1992).bin
/
MAC
/
THINKC
/
4_0
/
REXP_SRC
/
REGEXP.C
< prev
next >
Wrap
Text File
|
1989-09-18
|
16KB
|
564 lines
/*
| | Regular Expression Evaluator:
| |
| | Greg Anderson
| | 29 Kerr Hall
| | Social Sciences Computing
| | University of California, Santa Cruz
| | sirkm@ssyx
| |
| | For use with HyperCard XCMDs, XFCNs, and possibly other things.
| |
| | Make this file part of your XCMD project.
| | #include "regexp.h" in any of your files that may use routines
| | from this package.
| |
| | The regular expressions this package matches are as follows:
| |
| | c Any ordinary character 'c' not listed below matches that
| | character.
| |
| | \c A backslash (\) followed by a special character (one of
| | '.', '*', '+', '[' and '\') matches the special character
| | (i.e., the special meaning is removed).
| |
| | . A period (.) matches any single character except RETURN.
| |
| | [string] A non-empty string of characters enclosed in square
| | brackets matches any single character found in the set.
| | If the first character of such a string is ^, then
| | any single character NOT in the set is matched. '^'
| | loses its special meaning if it does not come first in
| | the string.
| | The character '-' indicates a range of characters;
| | for example, [a-z] will match any lowercase letter.
| | '-' loses its special meaning if it comes first (or
| | after a leading '^') or last in the string.
| |
| | c* Any one-character regular expression followed by a *
| | matches zero or more occurrences of that expression.
| | If there is any choice, the longest leftmost string
| | that matches is returned.
| |
| | c+ Like '*', but matches one or more occurrences of the
| | single-character regular expression.
| |
| | ^ A caret (^) at the beginning of an entire regular
| | expression constrains that regular expression to only
| | match strings found at the beginning of a line.
| |
| | $ A currency symbol ($) at the end of an entire regular
| | expression constrains that regular expression to only match
| | strings found at the end of a line.
| |
| | The following regular expressions are NOT supported:
| |
| | \< Beginning of word
| | \> End of word
| | \( ... \) "..." is treated as a regular expression
| | \{n,m\} Repeated matches of previous regular expression.
| |
*/
#include <MacTypes.h>
#include <FileMgr.h>
#include "regexp.h"
#define TRUE    1
#define FALSE   0

/*
|   toupper:
|
|   Maps 'a'..'z' to 'A'..'Z'; any other value is returned unchanged.
|
|   The argument and the whole expansion are parenthesized so that a
|   compound argument (for example  toupper(x ? 'a' : 'b') ) expands
|   correctly.  The argument is still evaluated more than once, so it
|   must not have side effects.
*/
#define toupper(c)  ((((c) >= 'a') && ((c) <= 'z')) ? ((c) - ('a' - 'A')) : (c))

/*
|   Option bits tested (and, in greplen, set) by the routines in this
|   file.  The bit names themselves are not defined in this file.
*/
int regexp_flags;
/*-----------------------------------------------------------------
|   end_of_line:
|
|   Reports whether 'line_ptr' is positioned at the end of a logical
|   line.
|
|   A null byte always ends the line.  A return character (\r) ends
|   the line unless one of the following applies:
|
|     NOBREAKS    set:  returns never end a line; only the null does.
|     MULTILINE   set:  a return followed by a character <= ' ' is
|                       treated as a continuation, not a line end.
|     FOLDEDLINE  set:  a return preceded by a backslash is treated
|                       as a continuation, not a line end.
|
|   A return that is immediately followed by the null terminator
|   always ends the line (checked before MULTILINE / FOLDEDLINE).
|
|   INPUTS:   line_ptr:  A pointer into the line.  When FOLDEDLINE
|                        is set and a return is found, the byte
|                        BEFORE line_ptr is also read.
|
|   OUTPUTS:  None save the return value.
|
|   RETURNS:  TRUE       End of line reached
|             FALSE      Not at the end of the line
-----------------------------------------------------------------*/
int end_of_line(line_ptr)
char    *line_ptr;
{
    char    here = *line_ptr;       /* character under the pointer  */
    char    next;                   /* character following it       */

    /* The terminator ends the text no matter which flags are set. */
    if( here == 0 )
        return(TRUE);

    /* Anything other than an honoured return is ordinary text. */
    if( (regexp_flags & NOBREAKS) || here != '\r' )
        return(FALSE);

    /* A return at the very end of the text is always a line end. */
    next = line_ptr[1];
    if( next == 0 )
        return(TRUE);

    /* Continuation: the following physical line starts with <= ' '. */
    if( (regexp_flags & MULTILINE) && next <= ' ' )
        return(FALSE);

    /* Continuation: the return was escaped with a backslash. */
    if( (regexp_flags & FOLDEDLINE) && line_ptr[-1] == '\\' )
        return(FALSE);

    return(TRUE);
}
/*-----------------------------------------------------------------
|   find_regexp:
|
|   Searches for occurrences of 'regexp' inside of 'line'.
|
|   'regexp' must have had some prior processing--a leading '^' and
|   a trailing '$' should be stripped before calling.  Note that
|   'greplen' will do this preprocessing.
|
|   INPUTS:   regexp:    A pointer to the regular expression
|             line:      A pointer to the line to search
|             start:     If zero, then 'regexp' must match 'line'
|                        starting with the first character of 'line'.
|             end:       If zero, then 'regexp' must also match
|                        'line' all the way to the end.
|
|   OUTPUTS:  start:     If specified, *start is set to the first
|                        character in 'line' at which 'regexp'
|                        matched.  Positions are tried left to
|                        right, so the leftmost match is reported.
|             end:       If specified, *end is set (by strgrep) to
|                        the first character in 'line' that was not
|                        part of the match.
|
|   RETURNS:  TRUE       'regexp' was found in 'line'
|             FALSE      'regexp' not found--'start' and 'end' are
|                        invalid.
-----------------------------------------------------------------*/
int find_regexp(regexp,line,start,end)
char    *regexp,
        *line,
        **start,
        **end;
{
    /*
    |   Anchored at the beginning: only the first position is tried.
    */
    if( !start )
        return( strgrep(regexp,line,end) );

    /*
    |   Unanchored: try every position up to (not including) the
    |   end of the line.
    */
    while( !end_of_line(line) )
    {
        if( strgrep(regexp,line,end) )
        {
            *start = line;
            return(TRUE);
        }
        ++line;
    }

    /*
    |   Special case -- searching for the end of a line and nothing
    |   else: the expression is empty and the caller passed a zero
    |   'end' (match must reach the end of the line).  The loop above
    |   never tries the end-of-line position itself, so accept it here.
    |
    |   'end' is tested, not '*end': 'end' is zero in exactly this
    |   case and '*end' is an output that has not been written.
    */
    if( !(*regexp) && !end )
    {
        *start = line;
        return(TRUE);
    }
    return(FALSE);
}
/*-----------------------------------------------------------------
|   strgrep:
|
|   Checks to see if the regular expression 'regexp' matches the
|   search line provided.  The match must be EXACT: 'line' is not
|   searched for occurrences of 'regexp', it is only checked to see
|   if 'regexp' matches 'line' starting with the first character.
|   ('line' may have unmatched trailing characters, however.)
|
|   INPUTS:   regexp:    A pointer to the regular expression
|             line:      A pointer to the line to search
|             end:       If zero, then 'regexp' must also match
|                        'line' all the way to the end.
|
|   OUTPUTS:  end:       If specified, *end is set on success to
|                        the first character in 'line' that was not
|                        part of the match.
|
|   RETURNS:  TRUE       'regexp' matched at the start of 'line'
|                        (and, when 'end' is zero, the match ran up
|                        to the end of the line).
|             FALSE      No match; *end is not written by this
|                        routine's own final assignment.
-----------------------------------------------------------------*/
int strgrep(regexp,line,end)
char    *regexp,
        *line,
        **end;
{
    char    *last = 0;      /* previous regexp element, for wildcards */

    /*
    |   Walk over every element of the regular expression.
    */
    while( *regexp )
    {
        /*
        |   If we have reached the end of the line but there are
        |   still characters in the regular expression, then the
        |   search has probably failed.
        |
        |   The exceptions accepted here are a remaining expression
        |   of exactly "*", or of one character followed by "*".
        */
        if( end_of_line(line) )
        {
            if( strcmp( regexp,"*" ) == 0 ) break;
            if( strcmp( regexp+1,"*" ) == 0 ) break;
            return(FALSE);
        }
        if( !chargrep(&regexp,&line,&last) )
        {
            /*
            |   The search character does not match: if the next
            |   element of the expression is not a '*', then the
            |   search has FAILED.
            */
            if( *regexp != '*' )
                return(FALSE);
            else
            {
                /*
                |   Zero occurrences are acceptable.  Back up the
                |   line pointer so that the same character may be
                |   checked against the element after the '*', and
                |   forget 'last' since the '*' has been consumed.
                */
                last = 0;
                --line;
                ++regexp;
            }
        }
    }

    /*
    |   If we are searching to the END of the line, then the input
    |   line must be out of valid characters in order to return
    |   a match.
    */
    if( !end )
        return( end_of_line(line) );
    *end = line;
    return(TRUE);
}
/*-----------------------------------------------------------------
|   chargrep:
|
|   Matches ONE element of the regular expression against the
|   character at the current line position.
|
|   INPUTS:   All inputs are pointers to pointers to strings, as
|             chargrep advances these pointers while comparing.
|
|             regexp:    Points into the regular expression
|             line:      Points into the line being searched
|             last:      Points at the previous element of the
|                        regular expression (read only when the
|                        current element is '*' or '+'; if it is
|                        zero, "." is substituted).
|
|   OUTPUTS:  regexp:    Advanced past the element that was tested.
|                        After a '*' or '+' it is set to an empty
|                        string, because the remainder of the
|                        expression was handed to wild_scan/strgrep.
|             line:      Advanced by one character (after any
|                        update made by wild_scan/strgrep in the
|                        wildcard case).
|             last:      Set to the value 'regexp' had on entry
|                        (not rewritten in the wildcard case unless
|                        it was zero).
|
|   RETURNS:  Non-zero if the element matched, zero otherwise.
-----------------------------------------------------------------*/
int chargrep(regexp,line,last)
char    **regexp,
        **line,
        **last;
{
    char    *entry   = *regexp;     /* expression position on entry  */
    char    subject  = **line;      /* line character under test     */
    char    op       = *entry;      /* expression element under test */
    int     matched;

    /*
    |   [set]: searchset consumes the whole bracket expression.
    */
    if( op == '[' )
    {
        *last = entry;
        *line += 1;
        return( searchset(regexp,subject) );
    }

    /*
    |   Wildcards: the line is scanned for the remainder of the
    |   expression while the previous element keeps matching.
    */
    if( op == '*' || op == '+' )
    {
        if( *last == 0 )
            *last = ".";
        matched = wild_scan(entry+1,line,*last);

        /*
        |   '*' only: if the scan failed, retry the remainder one
        |   character earlier in the line.
        */
        if( op == '*' && !matched )
            matched = strgrep(entry+1,(*line)-1,line);

        *line += 1;
        *regexp = "";
        return(matched);
    }

    if( op == '.' )
    {
        /*
        |   '.' matches anything except return / newline: turning
        |   the line character into a '.' makes the comparison
        |   below succeed.
        */
        if( subject != '\r' && subject != '\n' )
            subject = '.';
    }
    else if( op == '\\' )
    {
        /*
        |   Backslash escape: compare against the next expression
        |   character literally.
        */
        *regexp += 1;
    }

    /*
    |   Literal comparison.  With IGNORE set only the line character
    |   is folded to upper case here.
    */
    if( regexp_flags & IGNORE )
        subject = toupper(subject);
    matched = ( **regexp == subject );

    /*
    |   Step past the element and the line character, and remember
    |   where this element began.
    */
    *regexp += 1;
    *line   += 1;
    *last    = entry;
    return(matched);
}
/*-----------------------------------------------------------------
|   searchset:
|
|   Compares a [list] in the regular expression with just one
|   character from the input line.
|
|   The first character after '[' (or after a leading '^') is always
|   taken as a member of the set, even if it is ']'.  A '-' denotes
|   a range only when it has a character on both sides; a '-' that
|   comes first or last in the set is an ordinary member.
|
|   When IGNORE is set, the character being checked and the set
|   characters are all folded to upper case before comparison.
|
|   INPUTS:   regexp:    A pointer to a pointer into the regular
|                        expression.  On entry it must point at
|                        the '['.
|             check_c:   The character to check.
|
|   OUTPUTS:  regexp:    Points to the character AFTER the closing
|                        ']'.  If the set is not terminated by ']',
|                        it points at the null terminator instead.
|
|   RETURNS:  TRUE:      'check_c' was in the set (or, for an
|                        inverted set, was not in it)
|             FALSE:     otherwise, or the set was unterminated
-----------------------------------------------------------------*/
int searchset(regexp,check_c)
char    **regexp,
        check_c;
{
    char    *p,                 /* Cursor into the set              */
            c,                  /* The char from the set            */
            hi,                 /* The char following 'c'           */
            lc = 0;             /* The last char from set           */
    int     found = 0,          /* Flag: found check_c?             */
            invert = 0,         /* Flag: inverted search            */
            first = 1;          /* Flag: on the first set member    */

    /*
    |   Advance past the '[' and check for a leading '^'
    */
    p = *regexp + 1;
    if( *p == '^' )
    {
        invert = 1;
        ++p;
    }

    /*
    |   Fold the character being checked as well as the set
    |   characters; folding only one side would make lower-case
    |   input unmatchable.
    */
    if( regexp_flags & IGNORE )
        check_c = toupper(check_c);

    /*
    |   Stop at the closing ']' (unless it is the first member) or
    |   at the null terminator of a malformed expression.
    */
    while( *p && (first || *p != ']') )
    {
        c  = *p++;
        hi = *p;
        if( regexp_flags & IGNORE )
        {
            c  = toupper(c);
            hi = toupper(hi);
        }
        if( (c == '-') && lc && hi && (hi != ']') )
        {
            /*
            |   Check if the character lies within a range.  The
            |   upper bound is not consumed here; it is examined
            |   again as an ordinary member on the next pass.
            */
            if( (lc <= check_c) && (hi >= check_c) )
                found = 1;
        }
        /*
        |   Check if this character in the regexp list matches the
        |   character being checked.
        */
        else if( c == check_c )
            found = 1;
        lc = c;
        first = 0;
    }

    /*
    |   Unterminated set: leave the pointer on the terminator so the
    |   caller cannot run off the end, and report no match.
    */
    if( !(*p) )
    {
        *regexp = p;
        return(0);
    }

    *regexp = p + 1;
    return( found ^ invert );
}
/*-----------------------------------------------------------------
|   wild_scan:
|
|   Regular expression wildcard handling.  Tries to match the part
|   of the expression that follows a wildcard at successive
|   positions in the line, advancing one position at a time for as
|   long as the element BEFORE the wildcard keeps matching.
|
|   INPUTS:   regexp:    A pointer to the regular expression text
|                        that follows the wildcard.  Passed by
|                        value; the caller's pointer is not changed.
|             line:      A pointer to a pointer into the line being
|                        searched (*line is the first position
|                        tried).
|             last:      A pointer to the element of the regular
|                        expression before the wildcard.
|
|   OUTPUTS:  line:      If there was a match, *line holds what
|                        strgrep stored for the last position that
|                        matched (the longest match wins because
|                        later successes overwrite earlier ones).
|                        Otherwise unchanged.
|
|   RETURNS:  TRUE:      The pattern matched at some position.
|             FALSE:     The pattern did not match.
|
|   NOTE(review): strgrep stores a pointer to the first character
|   NOT matched, while chargrep increments *line again after this
|   routine returns.  Confirm which convention callers expect.
-----------------------------------------------------------------*/
int wild_scan(regexp,line,last)
char    *regexp,
        **line,
        *last;
{
    char    *scan = *line,
            *copy_of_last,
            *dummy = 0;     /* chargrep may read this; start it at zero */
    int     result = FALSE;

    while( !end_of_line(scan) )
    {
        /*
        |   If the last part of the regexp is matched at the current
        |   position of 'scan', then remember that a match has been
        |   found and keep scanning.
        |
        |   On success strgrep writes the end of the match through
        |   'line'.
        */
        if( strgrep(regexp,scan,line) )
            result = TRUE;

        /*
        |   If the character pointed to by scan does not match
        |   the regexp element before the wildcard, then
        |   the scan is terminated.  chargrep advances its regexp
        |   argument, so it is given a fresh copy each time.
        */
        copy_of_last = last;
        if( !chargrep(&copy_of_last,&scan,&dummy) ) break;
    }
    return(result);
}
/*-----------------------------------------------------------------
|   greplen:
|
|   Finds the length of a grep search string.  In the case of
|   strings containing wild cards, returns the MINIMUM length string
|   that could match the search string.
|
|   greplen is also responsible for finding the occurrence of ^ and $
|   at the beginning and end of the string (respectively).  If these
|   are present, greplen sets BEGINFLAG / ENDFLAG in regexp_flags
|   and strips them from the passed search string: a leading '^' is
|   skipped by advancing *searchstring, a trailing '$' is
|   overwritten with a null.
|
|   If IGNORE is set, MakeUpper is applied to the search string
|   first.
|
|   INPUTS:   searchstring:  Pointer to the pointer to the search
|                            string.  The string is modified in
|                            place.
|
|   RETURNS:  The minimum match length, or -1 if the search string
|             is not valid (a character below ' ' inside a [set] or
|             after a backslash, which includes a missing ']' and a
|             trailing backslash).
-----------------------------------------------------------------*/
int greplen(searchstring)
char    **searchstring;
{
    char    c,
            *string;
    int     len = 0;

    if( regexp_flags & IGNORE )
        MakeUpper(*searchstring);

    /*
    |   Does the search string begin with '^'?
    */
    if( **searchstring == '^' )
    {
        ++(*searchstring);
        regexp_flags |= BEGINFLAG;
    }
    string = *searchstring;

    /*
    |   Count the characters in the search string
    */
    while( (c = *string++) != 0 )
    {
        switch( c )
        {
            /*
            |   Since '*' might match zero characters, the length
            |   is decremented by one: the previous element does
            |   not have to be matched.
            */
            case '*':
                if( len ) --len;
                break;

            /*
            |   '+' is a wildcard, not a character to be matched:
            |   the previous element has already been counted once
            |   and '+' requires nothing more, so it adds nothing
            |   to the minimum length.
            */
            case '+':
                break;

            /*
            |   If a '$' is found at the end, then set the 'END'
            |   flag.  Otherwise, count the $ as a search character.
            */
            case '$':
                if( (*string) == 0 )
                {
                    *(string-1) = 0;
                    regexp_flags |= ENDFLAG;
                }
                else
                    ++len;
                break;

            /*
            |   Scan through an entire [string], counting it as only
            |   one character.  The first character after '[' is
            |   skipped unconditionally.  When this loop exits,
            |   string points to the ']', which will be counted in
            |   the search length on the next pass of the while()
            |   loop.
            */
            case '[':
                if( *string++ < ' ') return(-1);
                while( *string != ']' )
                    if( *string++ < ' ' ) return(-1);
                break;

            /*
            |   Backslash first steps past the escaped character and
            |   then counts the pair as one character.
            */
            case '\\':
                if( *string++ < ' ') return(-1);
                /* fall through */
            default:
                ++len;
        }
    }
    return(len);
}